
qui {

/**********************************************************************************/
/* program masala_merge : Fuzzy match using masalafied levenshtein                   */
/*stata presumably starts with two files, with an identifier and a string to match.
e.g. state_dist_id bank_village_name
     state_dist_id pc01_village_name 


current method.
1. stata outsheets two files, with an id and a name column.
2. python gets filenames from command lines, reads two files into two dictionaries.
2a. add max distance to python command line
3. python outputs a single file, with id, str1, str2, distance
4. stata reads this file and processes

can wrap 1, 4 in a stata calling function. this seems like a good approach.


VERSION HISTORY
- 2013/09/16: renames text1, text2 -> varname_master, varname_using

*/
/***********************************************************************************/
cap prog drop masala_merge
prog def masala_merge
{
  /* masala_merge (OBSOLETE -- masala_merge2 is the maintained version)

     Syntax:
       masala_merge varlist using filename, s1(strvar) outfile(filename) [dist(#)]

     varlist   : id variables; strings are only compared within groups of these ids
     using     : Stata dataset containing the same id variables and the string variable
     s1()      : name of the string variable to fuzzy match (same name in both datasets)
     outfile() : Stata file that receives the list of candidate matches
     dist()    : integer passed to lev.py through its -d flag (default 5)

     Relies on the globals $tmp (scratch directory) and $path_code (location of
     merge\lev.py). The dataset in memory is preserved and restored; all results
     go to outfile(). */
  syntax varlist using/, S1(string) OUTfile(string) [DIST(integer 5)]
  disp "THIS IS OBSOLETE, YOU SHOULD BE USING masala_merge2, WHICH IS SMARTER AND FRIENDLIER"
  
  // masala_merge state_id district_id using /tmp/pn/foo.dta, S1(village_name) OUTfile(string) [DIST(integer 5)]

  /* make everything quiet until python gets called -- it's not helpful */
  qui {

    /* store master filename (only used in the closing hint to the user) */
    local master $S_FN
    
    /* create a random number below 10000 to make the temp file names unique */
    local nonce = floor(uniform() * 10000)
    
    local src1 $tmp/src1_`nonce'.txt
    local src2 $tmp/src2_`nonce'.txt
    local out $tmp/out_`nonce'.txt
    local lev_groups $tmp/lev_groups_`nonce'.dta
    
    preserve
    
    keep `varlist' `s1'
    sort `varlist' `s1'
    
    /* merge two datasets on ids to produce group names. */
    /* the file name is quoted so that paths containing spaces work */
    merge m:m `varlist' using "`using'", keepusing(`varlist' `s1')
    
    // generate id groups
    egen g = group(`varlist')
    drop if mi(g)
    
    qui sum g
    local num_groups = r(max)
            
    // save group list: one row per id combination with its group number g
    keep g `varlist'
    duplicates drop
    save "`lev_groups'", replace
    
    /* now prepare group 1 (master side) */
    restore
    preserve
    
    keep `varlist' `s1'
  
    /* drop if missing string and store # observations */
    keep if !mi(`s1')
    qui count
    local g1_count = r(N)
    
    /* bring in group identifiers */
    merge m:1 `varlist' using "`lev_groups'", keepusing(g)
  
    /* places with missing ids won't match group */
    drop if _m == 1
  
    /* only keep matches */
    keep if _m == 3
    duplicates drop
    
    // outsheet string group 1
    outsheet g `s1' using "`src1'", comma replace nonames
    
    // prepare group2 (using side)
    /* this message is issued under quietly, exactly as in the original code */
    di "opening `using'..."
    use "`using'", clear
    keep `varlist' `s1'
  
    /* drop if missing string and store # observations */
    keep if !mi(`s1')
    qui count
    local g2_count = r(N)
    
    // merge in group identifiers
    merge m:1 `varlist' using "`lev_groups'", keepusing(g)
    
    /* something wrong if didn't match group ids for any observation */
    drop if _m == 1
  
    /* only keep matches */
    keep if _m == 3
    duplicates drop
    
    // outsheet string group 2
    outsheet g `s1' using "`src2'", comma replace nonames
  }
  
  // call python levenshtein program
  di "Matching `g1_count' strings to `g2_count' strings in `num_groups' groups."
  di "Calling lev.py:"

  di `" shell python -u ${path_code}\merge\lev.py -d `dist' -1 "`src1'" -2 "`src2'" -o "`out'" "'
  !python "${path_code}\merge\lev.py" -d `dist' -1 "`src1'" -2 "`src2'" -o "`out'"

  di "lev.py finished."

  /* quietly process the python output */
  qui {
    /* open output lev dataset */
    /* take care, this generates an error if zero matches */
    capture insheet using "`out'", comma nonames clear
  
    /* if there are zero matches, create an empty outfile and we're done. */
    /* noisily is required: a plain disp inside this quietly block prints nothing. */
    /* the preserved data are restored automatically when the program exits. */
    if _rc {
      noi disp "WARNING: masala_merge: There were no matches. Empty output file will be saved."
      clear
      save "`outfile'", replace emptyok
      exit
    }
    ren v1 g
    ren v2 `s1'_master
    ren v3 `s1'_using
    ren v4 lev_dist
  
    /* merge group identifiers back in */
    destring g, replace
    merge m:1 g using "`lev_groups'", keepusing(`varlist')
    
    /* _m == 1 would imply that our match list has groups not in the initial set */
    assert _m != 1
  
    /* _m == 2 are groups with zero matches. drop them */
    drop if _m == 2
  
    /* count specificity of each match: number of candidates per string */
    bys g `s1'_master: egen master_matches = count(g)
    bys g `s1'_using: egen using_matches = count(g)
  
    /* for each side, store the best and the second best distance among the */
    /* candidates of each string. rank(), unique breaks ties arbitrarily, so */
    /* rank 1 is the best candidate and rank 2 the runner-up. */
    foreach v in master using {
      bys g `s1'_`v': egen `v'_dist_rank = rank(lev_dist), unique
      
      gen tmp = lev_dist if `v'_dist_rank == 1
      bys g `s1'_`v': egen `v'_dist_best = max(tmp)
      drop tmp
      gen tmp = lev_dist if `v'_dist_rank == 2
      bys g `s1'_`v': egen `v'_dist_second = max(tmp)
      drop tmp
      
      drop `v'_dist_rank
    }
    
    drop g _m
  
  
    /* apply optimal matching rule (from pc9101 data in ~/iecmerge/include/calibrate_fuzzy.do) */
    /* initialize */
    gen keep_master = 1
    gen keep_using = 1
  
    /* get mean length of matched string */
    gen length = floor(0.5 * (length(`s1'_master) + length(`s1'_using)))
  
    /* 1. drop matches with too high a levenshtein distance (threshold is a function of length) */
    replace keep_master = 0 if lev_dist > 0.9 & length <= 4
    replace keep_master = 0 if lev_dist > 1.0 & length <= 5
    replace keep_master = 0 if lev_dist > 1.3 & length <= 8
    replace keep_master = 0 if lev_dist > 1.4 & inrange(length, 9, 14)
    replace keep_master = 0 if lev_dist > 1.8 & inrange(length, 15, 17)
    replace keep_master = 0 if lev_dist > 2.1
    
    /* copy these thresholds to keep_using */
    replace keep_using = 0 if keep_master == 0
  
    /* 2. never use a match that is not the best match */
    replace keep_master = 0 if (lev_dist > master_dist_best) & !mi(lev_dist)
    replace keep_using = 0 if (lev_dist > using_dist_best) & !mi(lev_dist)
    
    /* 3. apply best empirical safety margin rule. */
    /* when there is no second candidate the difference is missing, the */
    /* comparison is false and the match is kept. */
    replace keep_master = 0 if (master_dist_second - master_dist_best) < (0.4 + 0.25 * lev_dist)
    replace keep_using = 0 if (using_dist_second - using_dist_best) < (0.4 + 0.25 * lev_dist)
  
    /* save over output file */
    order `varlist' `s1'_master `s1'_using lev_dist keep_master keep_using master_* using_*
    save "`outfile'", replace
  }

  /* report conclusions. */
  di "Merged text fields saved in Stata file `outfile', and called `s1'_master and `s1'_using"
  disp "WHAT DO DO NEXT"
  di ". use `outfile'"
  di ". masala_review `varlist', s1(`s1') master(`master') using(`using')"
  di "       (or use masala_process, which does the same thing quietly)"
  restore
}
end
/* *********** END program masala_merge ***************************************** */

/**********************************************************************************/
/* program masala_merge2 : Fuzzy match using masalafied levenshtein                   */
/*stata presumably starts with two files, with an identifier and a string to match.
e.g. state_dist_id bank_village_name
     state_dist_id pc01_village_name 


current method.
1. stata outsheets two files, with an id and a name column.
2. python gets filenames from command lines, reads two files into two dictionaries.
2a. add max distance to python command line
3. python outputs a single file, with id, str1, str2, distance
4. stata reads this file and processes

can wrap 1, 4 in a stata calling function. this seems like a good approach.


VERSION HISTORY
- 2013/09/16: renames text1, text2 -> varname_master, varname_using

*/
/***********************************************************************************/
cap prog drop masala_merge2
prog def masala_merge2
{
  /* masala_merge2: fuzzy string merge of the data in memory with a using file.

     Syntax:
       masala_merge2 varlist using filename, s1(strvar) outfile(filename)
                     [dist(#) quietly keepusing(varlist)]

     varlist     : id variables; strings are only compared within groups of these ids
     using       : Stata dataset containing the same id variables and the string variable
     s1()        : name of the string variable to fuzzy match (same name in both datasets)
     outfile()   : Stata file that receives the full list of candidate matches
     dist()      : integer passed to lev.py through its -d flag (default 5)
     quietly     : finish with masala_process instead of masala_review
     keepusing() : forwarded to masala_review / masala_process, which pass it to the
                   merge that brings the using dataset back in

     Scratch files are written to a "tmp" folder relative to the working directory
     and lev.py is looked up under the global $path_code. Unless there are zero
     matches, the program ends with the merged result in memory. */

  syntax varlist using/, S1(string) OUTfile(string) [DIST(integer 5) quietly KEEPUSING(passthru)] 
  // masala_merge2 state_id district_id using /tmp/pn/foo.dta, S1(village_name) OUTfile(string) [DIST(integer 5)]

  /* make everything quiet until python gets called -- it's not helpful */
  qui {

    /* create temporary file to store original dataset. */
    /* file names are quoted throughout so that paths containing spaces work */
    tempfile master
    save "`master'", replace

    /* create a random number below 10000 to make the temp file names unique */
    local nonce = floor(uniform() * 10000)
    
    local src1 "tmp\src1_`nonce'.txt"
    local src2 "tmp\src2_`nonce'.txt"
    local out "tmp\out_`nonce'.txt"
    local lev_groups "tmp\lev_groups_`nonce'.dta"
    
    preserve
    
    keep `varlist' `s1'
    sort `varlist' `s1'
    
    /* merge two datasets on ids to produce group names */
    merge m:m `varlist' using "`using'", keepusing(`varlist' `s1')
    
    // generate id groups
    egen g = group(`varlist')
    drop if mi(g)
    
    qui sum g
    local num_groups = r(max)
            
    // save group list: one row per id combination with its group number g
    keep g `varlist'
    duplicates drop
    save "`lev_groups'", replace

    /* now prepare group 1 (master side) */
    restore
    preserve
    
    keep `varlist' `s1'
  
    /* drop if missing string and store # observations */
    keep if !mi(`s1')
    qui count
    local g1_count = r(N)
    
    /* bring in group identifiers */
    merge m:1 `varlist' using "`lev_groups'", keepusing(g)
  
    /* places with missing ids won't match group */
    drop if _m == 1
  
    /* only keep matches */
    keep if _m == 3
    duplicates drop
    
    // outsheet string group 1
    outsheet g `s1' using "`src1'", comma replace nonames
    
    // prepare group2 (using side)
    /* this message is issued under quietly, exactly as in the original code */
    di "opening `using'..."
    use "`using'", clear
    keep `varlist' `s1'
  
    /* drop if missing string and store # observations */
    keep if !mi(`s1')
    qui count
    local g2_count = r(N)
    
    // merge in group identifiers
    merge m:1 `varlist' using "`lev_groups'", keepusing(g)
    
    /* something wrong if didn't match group ids for any observation */
    drop if _m == 1
  
    /* only keep matches */
    keep if _m == 3
    duplicates drop
    
    // outsheet string group 2
    outsheet g `s1' using "`src2'", comma replace nonames
  }
  
  // call python levenshtein program
  di "Matching `g1_count' strings to `g2_count' strings in `num_groups' groups."
  di "Calling lev.py:"

  di `" shell python -u ${path_code}\merge\lev.py -d `dist' -1 "`src1'" -2 "`src2'" -o "`out'" "'
  !python "${path_code}\merge\lev.py" -d `dist' -1 "`src1'" -2 "`src2'" -o "`out'"

  di "lev.py finished."

  /* quietly process the python output */
  qui {
    /* open output lev dataset */
    /* take care, this generates an error if zero matches */
    capture insheet using "`out'", comma nonames clear

    /* if there are zero matches, create an empty outfile and we're done. */
    /* noisily is required: a plain disp inside this quietly block prints nothing. */
    /* the preserved data are restored automatically when the program exits. */
    if _rc {
      noi disp "WARNING: masala_merge2: There were no matches. Empty output file will be saved."
      clear
      save "`outfile'", replace emptyok
      exit
    }
    ren v1 g
    ren v2 `s1'_master
    ren v3 `s1'_using
    ren v4 lev_dist
  
    /* merge group identifiers back in. */
    /* force turns any non-numeric group value into missing instead of failing */
    destring g, replace force
    merge m:1 g using "`lev_groups'", keepusing(`varlist')

    /* _m == 1 would imply that our match list has groups not in the initial set */
    assert _m != 1

    /* _m == 2 are groups with zero matches. drop them */
    drop if _m == 2
  
    /* count specificity of each match: number of candidates per string */
    bys g `s1'_master: egen master_matches = count(g)
    bys g `s1'_using: egen using_matches = count(g)

    /* for each side, store the best and the second best distance among the */
    /* candidates of each string. rank(), unique breaks ties arbitrarily, so */
    /* rank 1 is the best candidate and rank 2 the runner-up. */
    destring lev_dist, replace force
    foreach v in master using {
      bys g `s1'_`v': egen `v'_dist_rank = rank(lev_dist), unique
      gen tmp = lev_dist if `v'_dist_rank == 1
      bys g `s1'_`v': egen `v'_dist_best = max(tmp)
      drop tmp
      gen tmp = lev_dist if `v'_dist_rank == 2
      bys g `s1'_`v': egen `v'_dist_second = max(tmp)
      drop tmp
      drop `v'_dist_rank
    }
    drop g _m

    /* apply optimal matching rule (from pc9101 data in ~/iecmerge/include/calibrate_fuzzy.do) */
    /* initialize */
    gen keep_master = 1
    gen keep_using = 1
  
    /* get mean length of matched string */
    gen length = floor(0.5 * (length(`s1'_master) + length(`s1'_using)))

    /* 1. drop matches with too high a levenshtein distance (threshold is a function of length) */
    replace keep_master = 0 if lev_dist > 0.9 & length <= 4
    replace keep_master = 0 if lev_dist > 1.0 & length <= 5
    replace keep_master = 0 if lev_dist > 1.3 & length <= 8
    replace keep_master = 0 if lev_dist > 1.4 & inrange(length, 9, 14)
    replace keep_master = 0 if lev_dist > 1.8 & inrange(length, 15, 17)
    replace keep_master = 0 if lev_dist > 2.1
    
    /* copy these thresholds to keep_using */
    replace keep_using = 0 if keep_master == 0
  
    /* 2. never use a match that is not the best match */
    replace keep_master = 0 if (lev_dist > master_dist_best) & !mi(lev_dist)
    replace keep_using = 0 if (lev_dist > using_dist_best) & !mi(lev_dist)
    
    /* 3. apply best empirical safety margin rule. */
    /* when there is no second candidate the difference is missing, the */
    /* comparison is false and the match is kept. */
    replace keep_master = 0 if (master_dist_second - master_dist_best) < (0.4 + 0.25 * lev_dist)
    replace keep_using = 0 if (using_dist_second - using_dist_best) < (0.4 + 0.25 * lev_dist)

    /* save over output file */
    order `varlist' `s1'_master `s1'_using lev_dist keep_master keep_using master_* using_*
    save "`outfile'", replace

  }
  restore

  /* load the match list; masala_review / masala_process operate on it */
  use "`outfile'", clear
  
  /* if quietly is not specified, use masala_review. */
  /* keepusing() is forwarded here; previously it was accepted but never used */
  if mi("`quietly'") {
    masala_review `varlist', s1(`s1') master(`master') using(`using') `keepusing'
  }

  /* if quietly was specified, use masala_process */
  else {
    masala_process `varlist', s1(`s1') master(`master') using(`using') `keepusing'
  }
  
  /* NOTE: the master copy is a Stata tempfile, which Stata erases once this */
  /* program finishes, so the path printed below is only valid during the run */
  di "Masala merge complete."
  di " Original master file was saved here:   `master'"
  di " Complete set of fuzzy matches is here: `outfile'"
}
end
/* *********** END program masala_merge2 ***************************************** */

/**********************************************************************************/
/* program masala_lev_dist : Calculate levenshtein distance between two vars */
/*                           uses external python program */
/***********************************************************************************/
cap prog drop masala_lev_dist
prog def masala_lev_dist
{
  /* masala_lev_dist: store the distance computed by lev.py between two variables.

     Syntax:
       masala_lev_dist var1 var2, gen(newvar)

     var1 var2 : the two variables to compare, row by row
     gen()     : name of the new variable that receives the distance

     The rows are written to tmp\masala_in.csv, lev.py (found under the global
     $path_code) writes tmp\masala_out.csv, and the result is merged back on the
     row number. The output file is read with a header row and must provide the
     columns _row_number and _masala_dist. */
  syntax varlist(min=2 max=2), GEN(name)

  /* fail immediately if gen() already exists, instead of after the python run */
  confirm new variable `gen'

  tokenize `varlist'

  /* remove leftovers from an earlier run that stopped half way. */
  /* _row_number is included: without it, a rerun after a failed run aborted */
  /* at the gen statement below. */
  foreach i in _masala_word1 _masala_word2 _masala_dist __masala_merge _row_number {
    cap drop `i'
  }

  /* trimmed working copies of the two inputs */
  gen _masala_word1 = `1'
  gen _masala_word2 = `2'
  replace _masala_word1 = trim(_masala_word1)
  replace _masala_word2 = trim(_masala_word2)

  /* row key used to merge the python output back in */
  gen _row_number = _n
  
  /* create temporary file for python  */
  outsheet _row_number _masala_word1 _masala_word2 using "tmp\masala_in.csv", comma replace nonames

  /* call external python program */
  di "Calling lev.py..."
  shell python "${path_code}\merge\lev.py" -1 "tmp\masala_in.csv" -o "tmp\masala_out.csv"

  /* convert created file to stata format */
  preserve
  insheet using "tmp\masala_out.csv", clear names
  save "tmp\masala_lev_dist", replace
  restore

  /* merge result with new dataset */
  merge 1:1 _row_number using "tmp\masala_lev_dist.dta", gen(__masala_merge) keepusing(_masala_dist)

  /* clean up */
  destring _masala_dist, replace
  ren _masala_dist `gen'
  drop _masala_word1 _masala_word2 _row_number
  
  /* every row must have received a distance */
  assert __masala_merge == 3
  drop __masala_merge
}
end
/* *********** END program masala_lev_dist ***************************************** */



/**********************************************************************************/
/* program fix_spelling */
/*

  Fixes spelling in a string variable, based on a supplied master list.
  - group(): optional variables; names are only matched within these groups

  syntax varname(min=1 max=1), SRCfile(string) [GEN(name) GROUP(varlist) keepall replace]
  fix_spelling pc01_district_name, group(pc01_state_name) src($keys/pc01districtkey) gen(new_district_name)

- need to specify exactly one of gen() or replace

- take data in data_list, merge it to master list.
- keep if _m == 2

- fuzzy merge data-list to master-list, maybe within some group.
- if a single match without competition, then replace data list with the data in master list

- return new version of data list */
/***********************************************************************************/
cap prog drop fix_spelling
prog def fix_spelling
{
  /* fix_spelling: correct the spelling of a string variable against a master list.

     Syntax:
       fix_spelling varname, srcfile(filename) [gen(newvar) group(varlist) keepall replace]

     varname   : variable to correct; srcfile() must contain a variable of the same name
     srcfile() : Stata dataset holding the correctly spelled names
     gen()     : new variable receiving the corrected name; a companion variable
                 named <gen>_dist receives the distance (0 for exact matches)
     replace   : overwrite varname instead of creating gen(); exactly one of
                 gen() and replace must be given
     group()   : variables within which names are matched (whole file if omitted)
     keepall   : also add master-list names that no observation matched

     Values that match the master list exactly are copied as they are. The
     remaining values are fuzzy matched with masala_merge and take the master
     spelling when masala_merge marks the match with keep_master == 1. Values
     with no accepted match are left missing in gen(). Scratch files are written
     to a "tmp" folder relative to the working directory. */
  syntax varname(min=1 max=1),  SRCfile(string) [GEN(name) GROUP(varlist) keepall replace]

  /* need to specify either generate or replace */
  if mi("`gen'") & mi("`replace'") {
    display as error "fix_spelling: Need to specify either generate or replace"
    exit 1
  }
  
  /* can't specify generate AND replace */
  if !mi("`gen'") & !mi("`replace'") {
    display as error "fix_spelling: Need to specify either generate or replace, not both"
    exit 1
  }

  /* if replace is set, create a temp var to be generated */
  if !mi("`replace'") {
    tempvar gen
  }
  
  /* if group is empty, create a constant group that spans the entire file */
  if mi("`group'") {
    gen __GROUP = 1
    local nogroup = 1
    local group "__GROUP"
  }
  
  /* for now, assume we have a source file */

  /* create the master list: unique group x name combinations from srcfile */
  preserve
  use "`srcfile'", clear
  if !mi("`nogroup'") gen __GROUP = 1
  keep `group' `varlist'
  duplicates drop
  sort `group' `varlist'
  save "tmp\__SPELLING_MASTER_LIST", replace
  restore
  
  /* create a list of unmatched names */
  preserve

  keep `varlist' `group'
  duplicates drop

  /* get rid of exact matches - these will work well */
  merge 1:1 `group' `varlist' using "tmp\__SPELLING_MASTER_LIST", gen(_merge1)
  keep if _merge1 == 1

  /* if nothing left, then the original list is fine and we're done */
  qui count
  if r(N) == 0 {
    restore
    di "100% of names matched. No fuzzy matching necessary"
    gen `gen' = `varlist'

    /* every name is an exact match, so the distance is zero; not created */
    /* under replace, where the distance variable is dropped anyway */
    if mi("`replace'") gen `gen'_dist = 0

    /* do not leave the helper group variable behind in the user's data */
    if !mi("`nogroup'") drop __GROUP
    exit
  }

  /* otherwise, go to the fuzzy merge */
  masala_merge `group' using "tmp\__SPELLING_MASTER_LIST", s1(`varlist') outfile("tmp\spelling_errors") dist(4)

  /* review masala merge results */
  use "tmp\spelling_errors", clear

  /* an empty result means there are no fuzzy candidates at all. instead of */
  /* leaving the program with nothing created, remember this and still apply */
  /* the exact matches below. */
  qui count
  local have_fuzzy = (r(N) > 0)

  if `have_fuzzy' {
    /* keep best match for everything in badly-spelled set */
    keep if keep_master == 1
    keep `group' `varlist'_master `varlist'_using lev_dist

    /* fix names so they can be merged back to the original dataset */
    ren `varlist'_master `varlist'
    ren `varlist'_using `gen'
    ren lev_dist `gen'_dist
    save "tmp\__SPELLING_CORRECTIONS", replace
  }
  restore

  /* tag exact matches (this merge only adds _merge_exact) */
  merge m:1 `group' `varlist' using "tmp\__SPELLING_MASTER_LIST", gen(_merge_exact)
  drop if _merge_exact == 2
  
  if `have_fuzzy' {
    /* then get fuzzy matches */
    merge m:1 `group' `varlist' using "tmp\__SPELLING_CORRECTIONS", gen(_merge_fuzzy)
    assert _merge_fuzzy != 2

    /* if we have an exact match, shouldn't have a fuzzy match */
    assert _merge_fuzzy == 1 if _merge_exact == 3
    drop _merge_fuzzy
  }
  else {
    /* no corrections available: start from empty output variables */
    gen `gen' = ""
    gen `gen'_dist = .
  }
  
  /* add exact matches */
  replace `gen' = `varlist' if _merge_exact == 3
  replace `gen'_dist = 0 if _merge_exact == 3
  drop _merge_exact

  /* if keepall specified, get places that didn't match */
  if !mi("`keepall'") {

    /* merge the spell-checked data back to the master list within the group. */
    /* the variables are swapped temporarily so the corrected name is the key */
    ren `varlist' `varlist'_SP
    ren `gen' `varlist'
    merge m:1 `group' `varlist' using "tmp\__SPELLING_MASTER_LIST", nogen keepusing(`varlist')
    ren `varlist' `gen'
    ren `varlist'_SP `varlist'
  }

  if !mi("`nogroup'") drop __GROUP

  /* if replace was specified */
  if !mi("`replace'") {

    /* show replacements made, one line per distinct original name */
    tempvar tag
    egen `tag' = tag(`varlist') if !mi(`gen') & `gen' != `varlist'
    disp "Spelling fixes and levenshtein distances:"
    list `varlist' `gen' `gen'_dist if `tag'
    
    // replace original var, show what was done, and drop the distance
    replace `varlist' = `gen' if !mi(`gen')
    drop `gen' `gen'_dist
  }
}
end
/* *********** END program fix_spelling ***************************************** */

/**********************************************************************************/
/* program masala_review : Reviews masala_merge results and calls masala_process  */
/***********************************************************************************/
cap prog drop masala_review
prog def masala_review
{
  /* masala_review: print review lists for a masala_merge output file that is
     open in memory, then run masala_process on it.

     Syntax:
       masala_review varlist, s1(strvar) master(filename) using(filename) [keepusing(varlist)]

     varlist     : id variables used in the fuzzy merge
     s1()        : name of the fuzzy-matched string variable
     master()    : master dataset, handed to masala_process
     using()     : using dataset, handed to masala_process
     keepusing() : handed to masala_process unchanged

     On return the data in memory are the result of masala_process. */
  syntax varlist, s1(string) master(string) using(string) [keepusing(passthru)]

  /* ensure a masala merge output file is open. stop here if it is not: */
  /* everything below needs keep_master and lev_dist. */
  cap confirm var keep_master
  if _rc {
    di as error "You must open the masala_merge output file before running this program."
    exit 111
  }
  
  /* count and report matches that are exact, but with alternatives */
  /* these are places where keep_master == 0 & lev_dist == 0 */
  qui bys `s1'_master: egen _min_dist = min(lev_dist)
  qui bys `s1'_master: egen _max_keep = max(keep_master)

  qui count if _max_keep == 0 & _min_dist == 0
  if `r(N)' > 0 {
    di "+-------------------------------------" _n "| These are exact matches, where alternate good matches exist." _n ///
      "| keep_master is 0, but masala_process() will keep the observations with lev_dist == 0." _n ///
        "+-------------------------------------" 
    list `varlist' `s1'* lev_dist if _max_keep == 0 & _min_dist == 0
  }
  qui drop _max_keep _min_dist

  /* visually review places with high lev_dist that script kept -- they look good. */
  /* the test is > 0 so that a single high cost match is listed as well */
  qui count if keep_master == 1 & lev_dist > 1
  if `r(N)' > 0 {
    disp "These are high cost matches, with no good alternatives. keep_master is 1."
    list `varlist' `s1'* lev_dist if keep_master == 1 & lev_dist > 1
  }

  /* run masala_process, and then show the unmatched places */
  masala_process `varlist', s1(`s1') master(`master') using(`using') `keepusing'

  /* tag each name so it doesn't appear more than once */
  qui egen _ntag = tag(`varlist' `s1')

  /* list unmatched places in a nice order: unmatched rows first, tagged rows first */
  qui gen _matched = _masala_merge == 3
  gsort _matched -_ntag `varlist' `s1'

  /* look at no more than the first 200 rows, and never beyond the last row */
  qui count
  local limit = min(r(N), 200)

  /* list unmatched places. skipped for an empty dataset, where "in 1/0" */
  /* would be an invalid observation range. */
  if `limit' > 0 {
    qui count if _masala_merge < 3 & _ntag in 1/`limit'
    if `r(N)' {
      disp "This is a sorted list of some places that did not match. Review for ideas on how to improve"
      list `varlist' `s1' _masala_merge if _masala_merge < 3 & _ntag in 1/`limit'
    }
  }

  drop _ntag _matched
}
end
/* *********** END program masala_review ***************************************** */

/**********************************************************************************/
/* program masala_process : Rejoins the initial files in a masala_merge           */
/**********************************************************************************/
cap prog drop masala_process
prog def masala_process
{
  /* masala_process: turn a masala_merge output file (open in memory) into a
     merged dataset by joining the accepted matches to the master and using files.

     Syntax:
       masala_process varlist, s1(strvar) master(filename) using(filename) [keepusing(varlist)]

     varlist     : id variables used in the fuzzy merge
     s1()        : name of the fuzzy-matched string variable
     master()    : master dataset (must contain varlist and s1)
     using()     : using dataset (must contain varlist and s1)
     keepusing() : passed to the merge with the using dataset

     Result: the data in memory hold the joined files, s1 carries the master
     spelling (the using spelling where there is no master row), the matched
     using spelling stays in <s1>_using, and _masala_merge is coded like
     _merge: 1 master only, 2 using only, 3 matched. */
  syntax varlist, s1(string) master(string) using(string) [keepusing(passthru)]

  /* override keep_master if lev_dist is zero. */
  replace keep_master = 1 if lev_dist == 0

  /* keep highly reliable matches only */
  keep if keep_master == 1

  /* drop all masala merge's variables */
  keep `varlist' `s1'* lev_dist

  /* bring back master dataset, keyed on the master spelling. */
  /* file names are quoted so that paths containing spaces work. */
  gen `s1' = `s1'_master
  merge m:m `varlist' `s1' using "`master'", gen(_masala_master)

  /* fill in master fuzzy-string from unmatched data on master side */
  replace `s1'_master = `s1' if mi(`s1'_master)
  drop `s1'

  /* bring back using dataset, keyed on the using spelling */
  gen `s1' = `s1'_using
  merge m:m `varlist' `s1' using "`using'", `keepusing' gen(_masala_using)

  /* fill in using fuzzy-string from unmatched data on using side  */
  replace `s1'_using = `s1' if mi(`s1'_using)
  drop `s1'

  /* set `s1' to the master value */
  ren `s1'_master `s1'

  /* fill in using values when _m == 2 */
  replace `s1' = `s1'_using if mi(`s1')

  /* Assertion: if we couldn't match back to the using, it must be unmatched from the master side */
  assert _masala_master == 2 if _masala_using == 1

  /* show merge result */
  disp "Results of masala_merge (counting unique strings only): "
  
  /* tag each name so it doesn't appear more than once. a tempvar is used so */
  /* that a variable called ntag in the merged data cannot cause a clash. */
  tempvar ntag
  qui egen `ntag' = tag(`varlist' `s1')

  /* create a standard merge output variable */
  qui gen _masala_merge = 1 if _masala_master == 2
  qui replace _masala_merge = 2 if _masala_using == 2
  qui replace _masala_merge = 3 if _masala_using == 3 & _masala_master == 3
  drop _masala_master _masala_using

  /* reuse the value label _merge for the combined indicator */
  label values _masala_merge _merge

  /* show results */
  table _masala_merge if `ntag'
  qui drop `ntag'
}
end
/* *********** END program masala_process ***************************************** */

/**********************************************************************************/
/* program review_merge : call right after a merge to review potential matches    */
/***********************************************************************************/
cap prog drop review_merge
prog def review_merge
{
  /* review_merge: list the observations that a preceding merge left unmatched.

     Syntax:
       review_merge varlist [, merge(varname)]

     varlist : variables to sort by and to show
     merge() : name of the merge indicator variable; _merge when omitted

     The data are sorted by varlist and every row whose indicator is below 3
     is listed together with the indicator. */
  syntax varlist, [merge(string)]

  /* fall back to the standard indicator name when none was given */
  if "`merge'" == "" local merge "_merge"

  sort `varlist'

  /* indicator values 1 and 2 are the unmatched rows */
  list `varlist' `merge' if `merge' < 3
}
end
/* *********** END program review_merge ***************************************** */

/***********************************************************************************/
/* program create_merge_fragments : Call right after a merge to create separate
                                    files of the unmatched pieces                  */
/***********************************************************************************/
cap prog drop create_merge_fragments
prog def create_merge_fragments
{

  /* create_merge_fragments: after a merge, write the unmatched rows of the
     original master and using files to separate files.

     idea is we had:
       file1: merge_vars a b c
       file2: merge_vars d e f
     and we want the file1 and file2 leftovers, each with its own variables.

     Syntax (call with the completed merge file open):
       create_merge_fragments 1:1|m:1|1:m varlist, master(filename) using(filename)
                              [merge(varname) suffix(string)]

     master()  : original master file; the fragment is saved as <master>_<suffix>
     using()   : original using file; the fragment is saved as <using>_<suffix>
     merge()   : merge indicator in the open dataset (default _merge)
     suffix()  : suffix appended to the fragment file names (default unmatched)

     The data in memory are left unchanged. */
  syntax anything, master(string) using(string) [merge(string) suffix(string)]

  /* set default values for merge and suffix locals */
  if mi("`merge'") local merge _merge
  if mi("`suffix'") local suffix unmatched
  
  /* the first three characters must be the merge type, as in merge syntax. */
  /* it is validated for compatibility with existing calls; see below for why */
  /* it is not needed to build the fragments. */
  local merge_type = substr("`anything'", 1, 3)

  if !inlist("`merge_type'", "1:1", "m:1", "1:m") {
    di as error "Must specify 1:1, m:1 or 1:m as in merge syntax"
    exit 198
  }
  
  local varlist = substr("`anything'", 4, .)
  
  /* confirm varlist is a varlist */
  confirm var `varlist'

  /* we want to leave this unaltered, so wrap everything in a preserve / restore */
  preserve

  /* keep only the matches; all we need from them is the key variables */
  keep if `merge' == 3
  keep `varlist'

  /* we only need one copy of each match, this allows everything below to be m:1 */
  duplicates drop
  tempfile merge3
  save "`merge3'", replace
  
  /* build one fragment per side. the list of matched keys is unique on */
  /* varlist, so m:1 is valid whether or not the keys are unique in the file */
  /* being processed. (a 1:x merge here fails whenever that file has repeated */
  /* keys, which is the case for the "m" side of an m:1 or 1:m merge.) */
  /* the indicator is a tempvar: the merge() name refers to the dataset that */
  /* was open on entry and does not exist in the master or using files. */
  tempvar frag
  foreach side in master using {
    use "``side''", clear
    merge m:1 `varlist' using "`merge3'", gen(`frag')

    /* keep only the non-matches; this also discards any key that is found */
    /* in the match list alone */
    keep if `frag' == 1

    /* drop the indicator and save fragment file */
    drop `frag'
    save "``side''_`suffix'", replace
  }
  
  restore

  /* report what happened */
  di "Created files with merge fragments:"
  di "  master: `master'_`suffix'"
  di "  using: `using'_`suffix'"
}
end
/* *********** END program create_merge_fragments ***************************************** */

}
